Reading the data

loving_daily <- readRDS("../DataProcessing/Trailer_daily_merge.rds")
loving_hourly <- readRDS("../DataProcessing/Trailer_hourly_merge.rds")
# making NA count to 0
loving_daily$count[is.na(loving_daily$count)] <- 0
loving_hourly$count[is.na(loving_hourly$count)] <- 0
loving_daily$weighted.count[is.na(loving_daily$weighted.count)] <- 0
loving_hourly$weighted.count[is.na(loving_hourly$weighted.count)] <- 0
# none flare within 100km
loving_daily$closest.flare[is.na(loving_daily$closest.flare)] <- 100
loving_hourly$closest.flare[is.na(loving_hourly$closest.flare)] <- 100
# creating a dataframe without the flares data
loving_daily_noflares <- loving_daily %>% select(-c(temp_bb,rhi, esf_bb, distToLovi,inv_dist))
loving_hourly_noflares <- loving_hourly %>% select(-c(temp_bb,rhi, esf_bb, distToLovi,inv_dist))
# Helper Function to find correlated variables above a threshold
correlated_above_threshold <- function(cor_matrix, var_name, threshold, response) {
  # Get the absolute values of correlations for the variable
  cor_values <- abs(cor_matrix[var_name, ])
  
  # Find the variables above the threshold
  above_threshold <- cor_values >= threshold
  
  # Get the names of the correlated variables above the threshold
  correlated_names <- setdiff(names(above_threshold[above_threshold]), "rd_particle_pCi")
  
  # Get the correlation scores with response
  correlated_scores<- cor_matrix[response, correlated_names]
  
  return (list(variables = correlated_names, scores = correlated_scores))
}

Modeling

Fitting on Daily

# First we do it for daily

# Define the response variable
response_var <- "rd_particle_pCi_mean"

# Define the names of the columns to exclude
exclude_cols <- c("rd_particle_pCi_mean", "rd_particle_B_mean", "radon_pCi_mean", "radon_B_mean", 
                  "temp_bb", "rhi", "esf_bb", "distToLovi", "monthly_oil", "monthly_gas", "distToLovi_wells",
                  "inv_dist", "datetime")

# Generate the formula for the GAM model
formula_str_s <- paste(response_var, "~", paste(paste0("s(", setdiff(names(loving_daily), exclude_cols), ")"), collapse = "+"))
formula_str <- paste(response_var, "~", paste(setdiff(names(loving_daily), exclude_cols), collapse = "+"))


# Fit the GAM model 
daily_v1 <- mgcv::gam(as.formula(paste0(formula_str, "+s(as.numeric(datetime))")), data = loving_daily_noflares)

# View the summary of the GAM model
summary(daily_v1)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## rd_particle_pCi_mean ~ co_mean + no_mean + no2_mean + nox_mean + 
##     o3_mean + temp_f_mean + pressure_altcorr_mean + wsp_mean + 
##     wdr_mean + relh_mean + rain_mean + solr_mean + co2_ppm_mean + 
##     ch4_mean + h2o_sync_mean + ethane_mean + ethene_mean + propane_mean + 
##     propene_mean + X1_3.butadiene_mean + i.butane_mean + n.butane_mean + 
##     acetylene_mean + cyclopentane_mean + i.pentane_mean + n.pentane_mean + 
##     n.hexane_mean + isoprene_mean + n.heptane_mean + benzene_mean + 
##     n.octane_mean + toluene_mean + ethyl.benzene_mean + m.p.xylene_mean + 
##     o.xylene_mean + closest.flare + weighted.count + count + 
##     s(as.numeric(datetime))
## 
## Parametric coefficients:
##                         Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           -1.295e+00  1.962e+00  -0.660  0.51092   
## co_mean                5.726e-04  2.288e-04   2.502  0.01411 * 
## no_mean                6.996e-01  1.141e+00   0.613  0.54136   
## no2_mean               7.009e-01  1.142e+00   0.614  0.54083   
## nox_mean              -6.998e-01  1.141e+00  -0.613  0.54117   
## o3_mean               -1.999e-03  7.262e-04  -2.753  0.00713 **
## temp_f_mean            5.671e-04  1.732e-03   0.327  0.74406   
## pressure_altcorr_mean  1.114e-03  1.791e-03   0.622  0.53532   
## wsp_mean              -7.741e-03  5.335e-03  -1.451  0.15024   
## wdr_mean               2.893e-04  1.680e-04   1.722  0.08845 . 
## relh_mean             -2.417e-04  1.339e-03  -0.180  0.85720   
## rain_mean             -5.656e-01  8.385e-01  -0.675  0.50168   
## solr_mean             -7.362e-05  9.068e-05  -0.812  0.41893   
## co2_ppm_mean           6.934e-05  1.637e-03   0.042  0.96631   
## ch4_mean               8.629e-05  2.981e-05   2.894  0.00475 **
## h2o_sync_mean         -1.660e-02  3.394e-02  -0.489  0.62585   
## ethane_mean            3.654e-04  8.158e-04   0.448  0.65532   
## ethene_mean           -1.215e-02  8.301e-03  -1.463  0.14681   
## propane_mean           3.276e-04  1.430e-03   0.229  0.81927   
## propene_mean          -2.276e-01  2.507e-01  -0.908  0.36649   
## X1_3.butadiene_mean    3.360e+00  2.200e+00   1.527  0.13017   
## i.butane_mean          5.420e-03  7.689e-03   0.705  0.48267   
## n.butane_mean         -1.787e-03  5.682e-03  -0.314  0.75393   
## acetylene_mean         2.676e-02  6.328e-02   0.423  0.67340   
## cyclopentane_mean      2.538e-01  2.242e-01   1.132  0.26047   
## i.pentane_mean        -1.477e-03  2.158e-02  -0.068  0.94558   
## n.pentane_mean        -8.861e-03  3.514e-02  -0.252  0.80150   
## n.hexane_mean         -3.610e-02  8.036e-02  -0.449  0.65434   
## isoprene_mean          1.064e+00  1.357e+00   0.784  0.43523   
## n.heptane_mean         4.678e-02  1.019e-01   0.459  0.64720   
## benzene_mean          -1.529e-01  8.498e-02  -1.799  0.07534 . 
## n.octane_mean         -2.564e-01  1.142e-01  -2.246  0.02713 * 
## toluene_mean           2.738e-01  1.058e-01   2.588  0.01123 * 
## ethyl.benzene_mean    -1.207e-01  8.005e-02  -1.508  0.13497   
## m.p.xylene_mean        2.143e-01  1.271e-01   1.685  0.09532 . 
## o.xylene_mean         -5.914e-01  4.537e-01  -1.303  0.19575   
## closest.flare          4.568e-04  5.361e-04   0.852  0.39637   
## weighted.count         1.893e-02  1.701e-02   1.113  0.26866   
## count                 -2.793e-05  2.600e-04  -0.107  0.91471   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                           edf Ref.df    F p-value
## s(as.numeric(datetime)) 1.744  2.278 1.13   0.393
## 
## R-sq.(adj) =  0.716   Deviance explained = 80.2%
## GCV = 0.0020141  Scale est. = 0.0013924  n = 132
plot(daily_v1)

# using leaps and regsubset to search for best predictors
# exhaustive
# not including flares and oil&gas variables
daily_exhaust = regsubsets(
  rd_particle_pCi_mean ~.-rd_particle_B_mean-radon_pCi_mean-radon_B_mean-monthly_oil-monthly_gas-distToLovi_wells, 
  data = loving_daily_noflares, nvmax=30, method = "exhaustive")
reg_summary = summary(daily_exhaust)
par(mfrow = c(2,2))

plot(reg_summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(reg_summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted RSq", type = "l")

# We will now plot a red dot to indicate the model with the largest adjusted R^2 statistic.
# The which.max() function can be used to identify the location of the maximum point of a vector
adj_r2_max = which.max(reg_summary$adjr2) #15

# The points() command works like the plot() command, except that it puts points 
# on a plot that has already been created instead of creating a new plot
points(adj_r2_max, reg_summary$adjr2[adj_r2_max], col ="red", cex = 2, pch = 20)

# We'll do the same for C_p and BIC, this time looking for the models with the SMALLEST statistic
plot(reg_summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
cp_min = which.min(reg_summary$cp) # 9
points(cp_min, reg_summary$cp[cp_min], col = "red", cex = 2, pch = 20)

plot(reg_summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
bic_min = which.min(reg_summary$bic) # 8
points(bic_min, reg_summary$bic[bic_min], col = "red", cex = 2, pch = 20)

# checking coefficients of the variables selected using adjusted R^2
plot(daily_exhaust, scale = "adjr2")

coef(daily_exhaust, 15)
##       (Intercept)           co_mean           o3_mean          wsp_mean 
##     -7.866126e-01      2.150247e-04     -1.503669e-03     -8.404306e-03 
##      co2_ppm_mean          ch4_mean     h2o_sync_mean       ethene_mean 
##      2.001216e-03      8.697424e-05     -3.539654e-02     -1.234194e-02 
## cyclopentane_mean    i.pentane_mean    n.pentane_mean     isoprene_mean 
##      2.104118e-01      2.216484e-02     -3.417823e-02     -1.016855e+00 
##     n.octane_mean      toluene_mean   m.p.xylene_mean     o.xylene_mean 
##     -7.835622e-02      1.078132e-01     -1.741389e-01      5.223981e-01
# checking coefficients of the variables selected using Cp
plot(daily_exhaust, scale = "Cp")

coef(daily_exhaust, 9)
##       (Intercept)           o3_mean          wsp_mean      co2_ppm_mean 
##     -1.015490e+00     -1.328889e-03     -1.079038e-02      2.560781e-03 
##          ch4_mean     h2o_sync_mean       ethene_mean cyclopentane_mean 
##      9.376348e-05     -3.062397e-02     -1.221147e-02      1.821697e-01 
##     n.hexane_mean      toluene_mean 
##     -4.190251e-02      6.349890e-02
# checking coefficients of the variables selected using BIC
plot(daily_exhaust, scale = "bic")

coef(daily_exhaust, 8)
##    (Intercept)        o3_mean       wsp_mean   co2_ppm_mean       ch4_mean 
##  -1.154038e+00  -1.255093e-03  -1.118515e-02   2.910978e-03   8.784447e-05 
##  h2o_sync_mean    ethene_mean n.heptane_mean   toluene_mean 
##  -2.735884e-02  -1.246049e-02  -5.105644e-02   1.064432e-01
# Fitting a Gam on the BIC selected variables
daily_v2 <- mgcv::gam(rd_particle_pCi_mean ~ co_mean + o3_mean + wsp_mean + co2_ppm_mean + ch4_mean + h2o_sync_mean + ethene_mean  + i.pentane_mean + n.pentane_mean  + n.octane_mean + toluene_mean + m.p.xylene_mean + o.xylene_mean, data = loving_daily_noflares)

# View the summary of the GAM model
summary(daily_v2)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## rd_particle_pCi_mean ~ co_mean + o3_mean + wsp_mean + co2_ppm_mean + 
##     ch4_mean + h2o_sync_mean + ethene_mean + i.pentane_mean + 
##     n.pentane_mean + n.octane_mean + toluene_mean + m.p.xylene_mean + 
##     o.xylene_mean
## 
## Parametric coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.618e-02  5.279e-01   0.087 0.930437    
## co_mean          3.449e-04  1.615e-04   2.135 0.034813 *  
## o3_mean         -3.343e-03  6.082e-04  -5.496 2.27e-07 ***
## wsp_mean        -1.622e-02  4.568e-03  -3.550 0.000554 ***
## co2_ppm_mean     7.468e-05  1.232e-03   0.061 0.951757    
## ch4_mean         1.102e-04  2.391e-05   4.610 1.03e-05 ***
## h2o_sync_mean   -1.425e-02  8.571e-03  -1.663 0.098967 .  
## ethene_mean     -1.003e-02  7.043e-03  -1.424 0.156991    
## i.pentane_mean   1.158e-02  1.264e-02   0.916 0.361291    
## n.pentane_mean  -1.632e-02  1.142e-02  -1.429 0.155579    
## n.octane_mean   -1.239e-01  6.259e-02  -1.979 0.050097 .  
## toluene_mean     2.202e-01  4.499e-02   4.894 3.16e-06 ***
## m.p.xylene_mean  1.081e-01  9.107e-02   1.187 0.237489    
## o.xylene_mean   -7.793e-01  3.634e-01  -2.145 0.034035 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.671   Deviance explained = 70.4%
## GCV = 0.0018036  Scale est. = 0.0016123  n = 132
# Fit the GAM model with ridge regression
daily_v2 <- mgcv::gam(as.formula(
  paste0(formula_str, "+s(as.numeric(as.Date(datetime)))")
  ), data = loving_daily, method = "REML", select = TRUE, family = gaussian, 
  control = mgcv::gam.control(maxit = 100, trace = TRUE))

# View the summary of the GAM model
summary(daily_v2)
# correlation matrix
cor_matrix = cor(loving_daily_noflares %>% select(-c("radon_B_mean", "rd_particle_B_mean","radon_pCi_mean","rd_particle_pCi_mean",  "datetime", "monthly_oil", "monthly_gas", "distToLovi_wells")) %>% na.omit())
corrplot.mixed(cor_matrix, order="AOE", number.cex = 0.5, tl.pos = 'lt')

Fitting on Hourly

# For hourly

# Define the response variable
response_var <- "rd_particle_pCi"

# Define the names of the columns to exclude
exclude_cols <- c("radon_pCi", "rd_particle_B", "rd_particle_pCi", "radon_B",
                  "t_pump_f", "temp_bb", "rhi", "esf_bb", "distToLovi", "inv_dist",
                  "datetime", "monthly_oil", "monthly_gas", "distToLovi_wells",
                  "time_hourly")

# Generate the formula for the GAM model
formula_str_s <- paste(response_var, "~", paste(paste0("s(", setdiff(names(loving_hourly), exclude_cols), ")"), collapse = "+"))
formula_str <- paste(response_var, "~", paste(setdiff(names(loving_hourly), exclude_cols), collapse = "+"))


# Fit the GAM model 
hourly_v1 <- mgcv::gam(as.formula(
  paste0(formula_str, "+s(as.numeric(datetime))", "+s(as.numeric(substr(time_hourly, 12, 13)))")
  ), data = loving_hourly)

# View the summary of the GAM model
summary(hourly_v1)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## rd_particle_pCi ~ co + no + no2 + nox + o3 + temp_f + pressure_altcorr + 
##     wsp + wdr + relh + rain + solr + co2_ppm + ch4 + h2o_sync + 
##     ethane + ethene + propane + propene + X1_3.butadiene + i.butane + 
##     n.butane + acetylene + cyclopentane + i.pentane + n.pentane + 
##     n.hexane + isoprene + n.heptane + benzene + n.octane + toluene + 
##     ethyl.benzene + m.p.xylene + o.xylene + hour + closest.flare + 
##     weighted.count + count + s(as.numeric(datetime)) + s(as.numeric(substr(time_hourly, 
##     12, 13)))
## 
## Parametric coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.004e+00  1.828e-01  -5.493 4.32e-08 ***
## co                4.258e-04  4.226e-05  10.076  < 2e-16 ***
## no               -4.282e-01  1.269e-01  -3.373 0.000754 ***
## no2              -4.334e-01  1.270e-01  -3.414 0.000650 ***
## nox               4.290e-01  1.269e-01   3.379 0.000737 ***
## o3               -8.641e-04  1.680e-04  -5.143 2.90e-07 ***
## temp_f           -8.716e-04  4.040e-04  -2.157 0.031066 *  
## pressure_altcorr  1.757e-03  6.195e-04   2.836 0.004608 ** 
## wsp              -1.089e-03  9.101e-04  -1.196 0.231745    
## wdr               1.139e-04  2.222e-05   5.126 3.17e-07 ***
## relh             -2.077e-03  2.797e-04  -7.427 1.48e-13 ***
## rain              1.328e-02  6.527e-02   0.203 0.838801    
## solr              3.290e-05  1.237e-05   2.660 0.007852 ** 
## co2_ppm           4.469e-03  2.828e-04  15.804  < 2e-16 ***
## ch4               5.259e-05  4.294e-06  12.246  < 2e-16 ***
## h2o_sync          1.781e-02  8.297e-03   2.146 0.031950 *  
## ethane            3.356e-04  7.293e-05   4.602 4.38e-06 ***
## ethene           -2.317e-03  3.166e-03  -0.732 0.464320    
## propane          -8.649e-05  1.066e-04  -0.812 0.417147    
## propene          -6.952e-02  2.981e-02  -2.332 0.019757 *  
## X1_3.butadiene    5.622e-01  2.176e-01   2.583 0.009833 ** 
## i.butane         -5.949e-04  6.778e-04  -0.878 0.380245    
## n.butane         -9.720e-05  4.674e-04  -0.208 0.835273    
## acetylene         4.057e-03  7.783e-03   0.521 0.602217    
## cyclopentane     -5.389e-02  2.127e-02  -2.534 0.011335 *  
## i.pentane        -2.713e-03  1.800e-03  -1.508 0.131715    
## n.pentane         5.924e-03  2.839e-03   2.087 0.037019 *  
## n.hexane         -3.552e-03  6.430e-03  -0.552 0.580693    
## isoprene          1.120e-01  1.432e-01   0.782 0.434165    
## n.heptane         8.550e-03  1.130e-02   0.757 0.449409    
## benzene          -2.187e-02  8.386e-03  -2.608 0.009164 ** 
## n.octane         -8.600e-02  2.283e-02  -3.768 0.000168 ***
## toluene           5.509e-02  1.037e-02   5.311 1.18e-07 ***
## ethyl.benzene    -7.573e-02  1.448e-02  -5.230 1.83e-07 ***
## m.p.xylene        1.269e-01  2.777e-02   4.568 5.15e-06 ***
## o.xylene         -3.513e-02  6.694e-02  -0.525 0.599789    
## hour             -2.232e-01  4.208e-02  -5.304 1.22e-07 ***
## closest.flare     1.588e-04  2.048e-04   0.775 0.438208    
## weighted.count    3.397e-03  6.338e-03   0.536 0.591977    
## count             9.443e-05  9.724e-05   0.971 0.331600    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                              edf Ref.df     F p-value    
## s(as.numeric(datetime))                    8.864  8.994 43.54  <2e-16 ***
## s(as.numeric(substr(time_hourly, 12, 13))) 7.023  8.098 18.37  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 57/58
## R-sq.(adj) =  0.731   Deviance explained = 73.6%
## GCV = 0.0045911  Scale est. = 0.0044997  n = 2770
# using leaps and regsubset to search for best predictors
# exhaustive
# not including flares and oil&gas variables

# adding a hour variable first
loving_hourly_noflares <- loving_hourly_noflares %>% mutate(hour = as.numeric(substr(time_hourly, 12, 13)))

hourly_exhaust = regsubsets(
  radon_pCi~.-time_hourly -rd_particle_B-rd_particle_pCi-radon_B-monthly_oil-monthly_gas-distToLovi_wells, 
  data = loving_hourly_noflares, nvmax=30, method = "exhaustive")
reg_summary = summary(hourly_exhaust)
par(mfrow = c(2,2))

plot(reg_summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(reg_summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted RSq", type = "l")

# We will now plot a red dot to indicate the model with the largest adjusted R^2 statistic.
# The which.max() function can be used to identify the location of the maximum point of a vector
adj_r2_max = which.max(reg_summary$adjr2) #25

# The points() command works like the plot() command, except that it puts points 
# on a plot that has already been created instead of creating a new plot
points(adj_r2_max, reg_summary$adjr2[adj_r2_max], col ="red", cex = 2, pch = 20)

# We'll do the same for C_p and BIC, this time looking for the models with the SMALLEST statistic
plot(reg_summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
cp_min = which.min(reg_summary$cp) # 20
points(cp_min, reg_summary$cp[cp_min], col = "red", cex = 2, pch = 20)

plot(reg_summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
bic_min = which.min(reg_summary$bic) # 16
points(bic_min, reg_summary$bic[bic_min], col = "red", cex = 2, pch = 20)

# checking coefficients of the variables selected using adjusted R^2
plot(hourly_exhaust, scale = "adjr2")

coef(hourly_exhaust, 25)
##      (Intercept)         datetime               co              no2 
##    -2.492603e+01     7.509371e-04     1.405440e-04    -5.323818e-03 
##              nox           temp_f pressure_altcorr              wsp 
##     7.586770e-04    -5.556106e-03     5.237031e-03    -2.641061e-02 
##              wdr             relh          co2_ppm         h2o_sync 
##     4.765364e-04    -1.841294e-03     1.401657e-02    -2.407520e-02 
##           ethane          propene   X1_3.butadiene         i.butane 
##     8.290471e-04    -3.109479e-01     2.904302e+00     5.668343e-03 
##         n.butane        acetylene     cyclopentane        n.pentane 
##    -3.851351e-03    -7.103973e-02     1.722275e-01    -4.858070e-03 
##         n.hexane          benzene    ethyl.benzene       m.p.xylene 
##    -1.625355e-02     3.802089e-02    -9.486146e-02     2.056836e-01 
##         o.xylene             hour 
##    -3.620882e-01    -2.159373e-03
# checking coefficients of the variables selected using Cp
plot(hourly_exhaust, scale = "Cp")

coef(hourly_exhaust, 20)
##      (Intercept)         datetime              no2              nox 
##    -2.311603e+01     6.524077e-04    -5.304310e-03     9.193150e-04 
##           temp_f pressure_altcorr              wsp              wdr 
##    -6.255999e-03     5.316260e-03    -2.642356e-02     4.969661e-04 
##             relh          co2_ppm           ethane          propene 
##    -2.616477e-03     1.425182e-02     8.758579e-04    -3.086046e-01 
##   X1_3.butadiene         i.butane         n.butane        acetylene 
##     2.769437e+00     5.300563e-03    -4.218410e-03    -6.892481e-02 
##     cyclopentane         n.hexane        n.heptane          benzene 
##     1.553230e-01    -3.506564e-02     4.037811e-02     4.005663e-02 
##             hour 
##    -2.116066e-03
# checking coefficients of the variables selected using adjusted R^2
plot(hourly_exhaust, scale = "bic")

coef(hourly_exhaust, 16)
##      (Intercept)         datetime              no2           temp_f 
##    -2.282576e+01     6.382371e-04    -3.510055e-03    -6.145565e-03 
## pressure_altcorr              wsp              wdr             relh 
##     5.269061e-03    -2.598156e-02     5.024433e-04    -2.553147e-03 
##          co2_ppm           ethane          propene   X1_3.butadiene 
##     1.430202e-02     9.307781e-04    -3.153754e-01     2.781602e+00 
##         i.butane         n.butane        acetylene          benzene 
##     3.971486e-03    -3.536060e-03    -7.738726e-02     6.578672e-02 
##             hour 
##    -2.165885e-03
# Fitting a Gam on the BIC selected variables
hourly_v2 <- mgcv::gam(radon_pCi~ s(as.numeric(datetime)) + no2 + temp_f + pressure_altcorr + wsp + wdr + relh + co2_ppm + ethane + propene + X1_3.butadiene + n.butane + acetylene + benzene +s(hour), data = loving_hourly_noflares)

# View the summary of the GAM model
summary(hourly_v2)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## radon_pCi ~ s(as.numeric(datetime)) + no2 + temp_f + pressure_altcorr + 
##     wsp + wdr + relh + co2_ppm + ethane + propene + X1_3.butadiene + 
##     n.butane + acetylene + benzene + s(hour)
## 
## Parametric coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.019e+01  1.296e+00  -7.864 5.16e-15 ***
## no2              -1.015e-03  7.927e-04  -1.280 0.200645    
## temp_f           -7.077e-03  7.757e-04  -9.123  < 2e-16 ***
## pressure_altcorr  6.992e-03  1.214e-03   5.760 9.26e-09 ***
## wsp              -2.550e-02  1.860e-03 -13.706  < 2e-16 ***
## wdr               5.575e-04  4.508e-05  12.367  < 2e-16 ***
## relh             -3.898e-03  2.796e-04 -13.938  < 2e-16 ***
## co2_ppm           9.964e-03  5.543e-04  17.974  < 2e-16 ***
## ethane            8.467e-04  1.065e-04   7.949 2.65e-15 ***
## propene          -2.243e-01  4.852e-02  -4.622 3.96e-06 ***
## X1_3.butadiene    2.167e+00  4.143e-01   5.230 1.81e-07 ***
## n.butane         -7.977e-04  2.150e-04  -3.710 0.000211 ***
## acetylene        -6.175e-02  1.560e-02  -3.959 7.70e-05 ***
## benzene           2.540e-02  7.996e-03   3.177 0.001505 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                           edf Ref.df      F p-value    
## s(as.numeric(datetime)) 8.862  8.994 20.663  <2e-16 ***
## s(hour)                 6.605  7.768  8.651  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.608   Deviance explained = 61.1%
## GCV = 0.022872  Scale est. = 0.022646  n = 2977
plot(hourly_v2)

Exhaustive Search and removing correlated predictors

# correlation matrix
M_hourly = cor(loving_hourly_noflares %>% select(-c( "datetime", "monthly_oil", "monthly_gas", "distToLovi_wells", 
                                                     "radon_B", "rd_particle_B", "radon_pCi","time_hourly")) %>% na.omit())
corrplot.mixed(M_hourly, order="AOE", number.cex = 0.5, tl.pos = "lt")

remaining_predictors <- setdiff(colnames(M_hourly), "rd_particle_pCi")
for (var_name in setdiff(colnames(M_hourly), "rd_particle_pCi")) {
  # skip the variable if it already has been removed
  if (!(var_name %in% remaining_predictors)){
    next
  }
  # find variables above threshold and return their correlation with response
  top_cor_vars <- correlated_above_threshold(M_hourly, var_name, 0.7, "rd_particle_pCi")
  
  # only keep the variables that are still in remaining_predictors
  top_cor_vars$variables <- intersect(top_cor_vars$variable, remaining_predictors)
  top_cor_vars$scores <- top_cor_vars$scores[top_cor_vars$variable]
  # if var_name is not correlated with any other variable above threshold
  if (length(top_cor_vars$variables) == 1){
    next
  }
  
  best <- names(which.max(abs(top_cor_vars$scores)))
  to_remove <- setdiff(top_cor_vars$variables, best)
  
  # print(cat("varname", var_name))
  # print(cat("kicking out", to_remove))
  # remove the to_remove variables from remaining_predictors
  remaining_predictors <- setdiff(remaining_predictors, to_remove)
}
remaining_predictors
##  [1] "co"               "temp_f"           "pressure_altcorr" "wsp"             
##  [5] "wdr"              "relh"             "rain"             "solr"            
##  [9] "ch4"              "ethene"           "X1_3.butadiene"   "isoprene"        
## [13] "m.p.xylene"       "hour"             "weighted.count"
M_hourly_2 <- cor(loving_hourly_noflares %>% select("rd_particle_pCi", "co",  "temp_f", "pressure_altcorr", "wsp", "wdr", "relh","rain", "solr","ch4", "ethene", "X1_3.butadiene", "isoprene" , "m.p.xylene", "hour",  "weighted.count") %>% na.omit())
corrplot.mixed(M_hourly_2, number.cex = 0.5, tl.pos = "lt")

# fit a Gam on that
hourly_no_cor <- mgcv::gam(rd_particle_pCi ~ co + temp_f + pressure_altcorr + wsp + wdr + relh + rain + ch4 + ethene + X1_3.butadiene + isoprene + m.p.xylene + hour + weighted.count, data = loving_hourly_noflares)
summary(hourly_no_cor)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## rd_particle_pCi ~ co + temp_f + pressure_altcorr + wsp + wdr + 
##     relh + rain + ch4 + ethene + X1_3.butadiene + isoprene + 
##     m.p.xylene + hour + weighted.count
## 
## Parametric coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -3.556e+00  5.016e-01  -7.090 1.67e-12 ***
## co                2.588e-04  4.173e-05   6.202 6.34e-10 ***
## temp_f            1.617e-04  1.713e-04   0.944   0.3451    
## pressure_altcorr  3.458e-03  4.928e-04   7.017 2.79e-12 ***
## wsp              -8.217e-03  9.680e-04  -8.489  < 2e-16 ***
## wdr               3.310e-04  2.302e-05  14.382  < 2e-16 ***
## relh             -2.085e-04  1.091e-04  -1.911   0.0561 .  
## rain             -8.990e-02  8.033e-02  -1.119   0.2632    
## ch4               7.715e-05  3.416e-06  22.585  < 2e-16 ***
## ethene            4.212e-03  2.752e-03   1.531   0.1260    
## X1_3.butadiene    2.609e-01  1.255e-01   2.080   0.0376 *  
## isoprene          1.388e-01  1.301e-01   1.068   0.2858    
## m.p.xylene        3.009e-02  5.021e-03   5.994 2.30e-09 ***
## hour             -2.975e-03  2.548e-04 -11.675  < 2e-16 ***
## weighted.count   -1.211e-03  1.360e-03  -0.890   0.3734    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.556   Deviance explained = 55.8%
## GCV = 0.007133  Scale est. = 0.0070973  n = 2991